Project: Amazon Product Recommendation System¶
This project builds a recommendation system using user ratings data from the Amazon Electronics category. We will preprocess the data, explore its characteristics, and implement a collaborative filtering approach using matrix factorization.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.sparse as sparse
from scipy.sparse.linalg import svds
import warnings
warnings.filterwarnings('ignore')
Load and Preview Dataset¶
We load the ratings_Electronics.csv file, which contains user-item ratings for Amazon electronics products. This step helps us inspect the structure and contents of the data before proceeding with any analysis.
# Load the dataset
df = pd.read_csv('ratings_Electronics.csv', names=['user_id', 'product_id', 'rating', 'timestamp'])
# Display the first 5 rows
df.head()
| | user_id | product_id | rating | timestamp |
|---|---|---|---|---|
| 0 | AKM1MP6P0OYPR | 0132793040 | 5.0 | 1365811200 |
| 1 | A2CX7LUOHB2NDG | 0321732944 | 5.0 | 1341100800 |
| 2 | A2NWSAGRHCP8N5 | 0439886341 | 1.0 | 1367193600 |
| 3 | A2WNBOD3WNDNKT | 0439886341 | 3.0 | 1374451200 |
| 4 | A1GI0U4ZRJA8WN | 0439886341 | 1.0 | 1334707200 |
Observation¶
- The dataset consists of 4 columns: `user_id`, `product_id`, `rating`, and `timestamp`.
- Each row represents a user's rating for a specific electronic product on Amazon.
- Ratings are numerical values, and `timestamp` indicates when the rating was made.
- There are no column headers in the CSV file, so we manually assigned them during loading.
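Since the file has no header row and runs to millions of rows, it can also help to declare dtypes at load time: `category` for the repeated ID strings and `float32` for ratings can shrink the memory footprint considerably. A minimal sketch, using a tiny in-memory stand-in for `ratings_Electronics.csv` (the CSV string below is purely illustrative):

```python
import io
import pandas as pd

# Tiny in-memory stand-in for the headerless ratings_Electronics.csv file.
csv_data = io.StringIO(
    "AKM1MP6P0OYPR,0132793040,5.0,1365811200\n"
    "A2CX7LUOHB2NDG,0321732944,5.0,1341100800\n"
)

# Declaring dtypes up front avoids storing every ID as a full Python string.
df_small = pd.read_csv(
    csv_data,
    names=['user_id', 'product_id', 'rating', 'timestamp'],
    dtype={'user_id': 'category', 'product_id': 'category',
           'rating': 'float32', 'timestamp': 'int64'},
)
print(df_small.dtypes)
```

The same `dtype` mapping passed to the `pd.read_csv` call in the cell above would apply this to the full dataset.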
Dataset Overview¶
We check the dataset's dimensions, data types, and whether there are any missing values. This helps us understand its structure and identify potential data quality issues.
# Shape of the dataset (rows, columns)
print("Dataset shape:", df.shape)
# Data types and non-null counts
print("\nData Info:")
print(df.info())
# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())
Dataset shape: (7824482, 4)

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 4 columns):
 #   Column      Dtype
---  ------      -----
 0   user_id     object
 1   product_id  object
 2   rating      float64
 3   timestamp   int64
dtypes: float64(1), int64(1), object(2)
memory usage: 238.8+ MB
None

Missing values:
user_id       0
product_id    0
rating        0
timestamp     0
dtype: int64
Observation¶
- The dataset contains 7,824,482 rows and 4 columns.
- Column data types:
  - `user_id` and `product_id` are object (string) types.
  - `rating` is a float, likely ranging from 1.0 to 5.0.
  - `timestamp` is an integer representing Unix time.
- No missing values are present in any of the columns.
- Memory usage is approximately 238.8 MB.
Rating Distribution¶
We examine how the ratings are distributed to understand user behavior. This helps us decide whether to keep all ratings or filter only higher ratings for building a more focused recommendation model.
# Distribution of rating values
rating_counts = df['rating'].value_counts().sort_index()
# Plot the rating distribution
plt.figure(figsize=(8, 5))
sns.barplot(x=rating_counts.index, y=rating_counts.values, palette='Blues_d')
plt.title('Rating Distribution')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.grid(axis='y')
plt.show()
# Also print counts
print(rating_counts)
rating
1.0     901765
2.0     456322
3.0     633073
4.0    1485781
5.0    4347541
Name: count, dtype: int64
Observation¶
- The majority of ratings are 5.0, accounting for more than half of the dataset (≈4.3 million ratings).
- There's a clear skew toward positive ratings:
- 5.0: 4,347,541 ratings
- 4.0: 1,485,781 ratings
- Lower ratings (1.0 to 3.0) are much less frequent.
- This imbalance suggests that users tend to rate products they like, which is a common trend in recommendation system datasets.
- For building a recommendation model, we might consider filtering for ratings ≥ 4.0 to focus on strong user preferences.
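The skew described above can also be read as percentage shares using `value_counts(normalize=True)`. A small sketch on toy data (not the full dataset), with values chosen only to illustrate the call:

```python
import pandas as pd

# Toy stand-in for df['rating']; the real column has ~7.8M values.
ratings = pd.Series([5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 3.0, 1.0])

# normalize=True returns fractions instead of raw counts.
share = ratings.value_counts(normalize=True).sort_index()
print((share * 100).round(1))
```

Applied to `df['rating']`, this would confirm directly that 5.0 ratings make up more than half of the dataset.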
Why Filter for Ratings ≥ 4.0?¶
Filtering the dataset to keep only higher ratings (typically 4.0 and 5.0) is a common preprocessing step in recommendation systems. Here's why this approach is often preferred:
1. Focus on Positive Preferences¶
Most recommendation algorithms, especially collaborative filtering, are designed to identify patterns in what users like. Including only strong positive signals (ratings ≥ 4.0) helps the model learn user preferences more effectively.
- A rating of 4 or 5 generally indicates satisfaction.
- Lower ratings may reflect many unrelated factors (product defects, shipping issues, etc.) and introduce noise.
2. Mimicking Implicit Feedback¶
Many modern recommendation engines operate using implicit feedback — such as clicks, views, or purchases — rather than explicit 1–5 ratings.
- Filtering for high ratings emulates implicit positive signals.
- It helps the model interpret user behavior more like: "this user engaged positively with this item."
3. Reducing Sparsity and Noise¶
User-item interaction matrices are typically very sparse, especially with millions of products.
- Filtering out low ratings reduces the size of the matrix.
- The remaining data has a higher signal-to-noise ratio, improving both training speed and recommendation accuracy.
4. Improved Interpretability and Performance¶
- Recommendations are more interpretable when they are based on what users loved, rather than trying to balance positive and negative feedback.
- Filtering reduces computational complexity and helps matrix factorization techniques (like SVD) converge faster.
5. When Not to Filter¶
There are scenarios where retaining all ratings makes sense:
- If your goal is to predict the exact rating a user would give (regression model).
- If negative feedback is informative for your application (e.g., to avoid bad experiences).
However, for ranking top-N recommendations, filtering for higher ratings is typically beneficial.
Filter for Positive Ratings (≥ 4.0)¶
To reduce noise and focus on strong user preferences, we filter the dataset to include only ratings of 4.0 and 5.0. This step helps the recommendation system focus on what users truly liked, improving both training efficiency and output relevance.
# Filter the dataset for ratings >= 4.0
df_filtered = df[df['rating'] >= 4.0]
# Check the new shape and a preview
print("Filtered dataset shape:", df_filtered.shape)
df_filtered.head()
Filtered dataset shape: (5833322, 4)
| | user_id | product_id | rating | timestamp |
|---|---|---|---|---|
| 0 | AKM1MP6P0OYPR | 0132793040 | 5.0 | 1365811200 |
| 1 | A2CX7LUOHB2NDG | 0321732944 | 5.0 | 1341100800 |
| 5 | A1QGNMC6O1VW39 | 0511189877 | 5.0 | 1397433600 |
| 7 | A2TY0BTJOTENPG | 0511189877 | 5.0 | 1395878400 |
| 8 | A34ATBPOK6HCHY | 0511189877 | 5.0 | 1395532800 |
Observation¶
- After filtering for ratings ≥ 4.0, the dataset now contains 5,833,322 entries, down from the original 7.8 million.
- This means we've removed approximately 25% of the data (the ~1.99 million ratings of 1.0 to 3.0).
- The filtered dataset retains only strong positive interactions, which is ideal for building a recommendation system focused on user satisfaction.
- The structure of the dataset remains the same, with 4 columns: `user_id`, `product_id`, `rating`, and `timestamp`.
Analyze User and Product Activity¶
We examine how many unique users and products are in the filtered dataset. We also explore how many ratings each user has given and how many ratings each product has received. This helps us understand user engagement levels and item popularity.
# Number of unique users and products
num_users = df_filtered['user_id'].nunique()
num_products = df_filtered['product_id'].nunique()
print(f"Number of unique users: {num_users}")
print(f"Number of unique products: {num_products}")
# Count of ratings per user
user_activity = df_filtered['user_id'].value_counts()
# Count of ratings per product
product_popularity = df_filtered['product_id'].value_counts()
# Show the top 5 most active users and most rated products
print("\nTop 5 most active users:")
print(user_activity.head())
print("\nTop 5 most rated products:")
print(product_popularity.head())
Number of unique users: 3256144
Number of unique products: 410110

Top 5 most active users:
user_id
A3OXHLG6DIBRW8    464
ADLVFFE4VBT8      414
A5JLAU2ARJ0BO     358
A1ODOGXEYECQQ8    346
A6FIAB28IS79      340
Name: count, dtype: int64

Top 5 most rated products:
product_id
B0074BW614    16098
B007WTAJTO    12244
B0019EHU8G    11640
B00DR0PDNE    11604
B006GWO5WK    10048
Name: count, dtype: int64
Observation¶
- The filtered dataset contains:
- 3,256,144 unique users
- 410,110 unique products
- The most active users have submitted several hundred ratings:
- The top user has rated 464 products.
- The second and third most active users have rated 414 and 358 products respectively.
- Some products have extremely high numbers of ratings:
- The most rated product has received 16,098 ratings.
- Several others exceed 10,000 ratings.
This indicates a highly skewed distribution, where:
- A small subset of users is very active.
- A small number of products are very popular.
This kind of skew is common in recommendation datasets and is useful to know when considering filtering strategies for cold-start users or long-tail items.
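One way to quantify that skew is to ask what fraction of all ratings the most-rated items account for. A hedged sketch on toy data (the item IDs and counts here are made up for illustration):

```python
import pandas as pd

# Toy interaction log: one very popular product and a long tail.
df_toy = pd.DataFrame({'product_id': ['P1'] * 6 + ['P2'] * 2 + ['P3', 'P4']})

# value_counts() sorts descending, so head(n) gives the n most-rated items.
counts = df_toy['product_id'].value_counts()
top_share = counts.head(1).sum() / counts.sum()
print(f"Top product accounts for {top_share:.0%} of ratings")
```

Running the same computation against `product_popularity` would show how concentrated the real dataset is on its head items.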
Why Filter Low-Activity Users and Rarely-Rated Products?¶
Our dataset is rich: it has millions of users and hundreds of thousands of products, but many of those users and items appear only a few times. This can cause several issues:
Why Filter?¶
Sparsity Reduction
- Sparse user-item matrices slow down training and can weaken the model’s ability to find reliable patterns.
Cold-Start Problems
- Users/products with very few interactions have too little data for the algorithm to learn meaningful preferences.
Model Stability
- Users who’ve only rated 1–2 items don’t contribute much and may even skew results.
Performance Boost
- Smaller, denser matrices train faster and often result in more accurate recommendations.
Strategy¶
We'll filter to keep:
- Users who have rated at least 50 products
- Products that have been rated by at least 50 users
These thresholds are commonly used to balance coverage and quality.
Filter for Active Users and Popular Products¶
To reduce matrix sparsity and improve recommendation quality, we keep only:
- Users who have rated at least 50 products
- Products that have been rated by at least 50 users
This results in a denser dataset with stronger interaction signals.
# Keep users who have rated at least 50 products
active_users = user_activity[user_activity >= 50].index
df_filtered = df_filtered[df_filtered['user_id'].isin(active_users)]
# Recalculate product counts after user filtering
product_popularity = df_filtered['product_id'].value_counts()
# Keep products that have been rated at least 50 times
popular_products = product_popularity[product_popularity >= 50].index
df_filtered = df_filtered[df_filtered['product_id'].isin(popular_products)]
# Check updated shape
print("Filtered dataset shape after activity thresholds:", df_filtered.shape)
# Optional: Preview
df_filtered.head()
Filtered dataset shape after activity thresholds: (2770, 4)
| | user_id | product_id | rating | timestamp |
|---|---|---|---|---|
| 483958 | ADLVFFE4VBT8 | B0002L5R78 | 5.0 | 1229212800 |
| 484344 | A3G5MOHY1U635N | B0002L5R78 | 5.0 | 1362009600 |
| 484352 | A19W47CXJJP1MI | B0002L5R78 | 5.0 | 1323129600 |
| 484410 | A12DQZKRKTNF5E | B0002L5R78 | 5.0 | 1325116800 |
| 485096 | A25UZ7MA72SMKM | B0002L5R78 | 5.0 | 1276732800 |
Observation¶
- After filtering for activity thresholds (≥ 50 ratings per user and per product), the dataset now contains 2,770 entries.
- This is a significant reduction from over 5 million interactions — but now:
- All users included have demonstrated consistent engagement.
- All products included have sufficient feedback for modeling.
- While this results in a smaller dataset, it improves data density, reduces noise, and allows for more reliable collaborative filtering.
This subset is especially useful for testing or prototyping matrix-based models like SVD, NMF, or KNN. In a real production setting, we would likely use less aggressive thresholds to retain more users and items.
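One caveat with the sequential filter above (users first, then products): dropping unpopular products can push some users back below their threshold. A sketch of iterating until both conditions hold at once; the thresholds and toy data here are illustrative, not the notebook's actual values:

```python
import pandas as pd

def filter_until_stable(df, min_user=2, min_item=2):
    """Alternate user/item threshold filters until the frame stops shrinking."""
    while True:
        before = len(df)
        u = df['user_id'].value_counts()
        df = df[df['user_id'].isin(u[u >= min_user].index)]
        p = df['product_id'].value_counts()
        df = df[df['product_id'].isin(p[p >= min_item].index)]
        if len(df) == before:  # fixed point reached: both conditions hold
            return df

toy = pd.DataFrame({
    'user_id':    ['u1', 'u1', 'u2', 'u2', 'u3'],
    'product_id': ['p1', 'p2', 'p1', 'p2', 'p3'],
})
stable = filter_until_stable(toy)
print(stable)
```

For this project's single-pass filter the residual effect is small, but an iterative version guarantees every surviving user and product meets its threshold.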
Create the User-Item Matrix¶
We pivot the filtered dataset into a user-item matrix, where:
- Rows represent users
- Columns represent products
- Cells contain the user's rating for that product
This matrix will be sparse — most users haven't rated most products — but it forms the basis for collaborative filtering techniques.
# Create a pivot table (user-item matrix)
user_item_matrix = df_filtered.pivot_table(
index='user_id',
columns='product_id',
values='rating'
)
# Fill NaNs with 0 (optional, only if using models that expect filled matrix)
# user_item_matrix = user_item_matrix.fillna(0)
# Display shape and a preview
print("User-Item Matrix shape:", user_item_matrix.shape)
user_item_matrix.head()
User-Item Matrix shape: (828, 38)
| product_id | B0002L5R78 | B000JMJWV2 | B000LRMS66 | B000N99BBC | B000QUUFRW | B000VX6XL6 | B0019EHU8G | B001E1Y5O6 | B001TH7GUU | B002R5AM7C | ... | B00829TIEK | B0082E9K7U | B00834SJNA | B00834SJSK | B0088CJT4U | B008DWCRQW | B009SYZ8OC | B00BOHNYTW | B00G4UQ6U8 | B00HFRWWAM |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id | |||||||||||||||||||||
| A100UD67AHFODS | NaN | NaN | 5.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | 5.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| A100WO06OQR8BQ | NaN | NaN | NaN | NaN | NaN | 5.0 | NaN | NaN | NaN | 5.0 | ... | NaN | NaN | NaN | NaN | 4.0 | 5.0 | NaN | NaN | NaN | NaN |
| A10AFVU66A79Y1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| A10NMELR4KX0J6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| A10O7THJ2O20AG | NaN | NaN | NaN | NaN | 5.0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 38 columns
Observation¶
- The user-item matrix has 828 users (rows) and 38 products (columns).
- Each cell contains a rating (either 4.0 or 5.0), or NaN if the user hasn't rated that product.
- The matrix is sparse — most users have only rated a few of the available products.
- This format is ideal for applying collaborative filtering methods such as SVD (Singular Value Decomposition) or KNN-based similarity models.
Next, we'll apply SVD to this matrix to generate personalized recommendations based on latent user and item features.
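The sparsity claim can be checked numerically: density is simply the share of cells that actually hold a rating. A small sketch on a toy matrix (the real one is 828 × 38):

```python
import numpy as np
import pandas as pd

# Toy user-item matrix with 2 ratings out of 6 possible cells.
matrix = pd.DataFrame(
    [[5.0, np.nan, np.nan],
     [np.nan, 4.0, np.nan]],
    index=['u1', 'u2'], columns=['p1', 'p2', 'p3'],
)

# Fraction of cells that are filled in.
density = matrix.notna().sum().sum() / matrix.size
print(f"Density: {density:.1%}")
```

The same two lines applied to `user_item_matrix` give the exact density of our filtered data.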
Apply Matrix Factorization using SVD¶
We apply Singular Value Decomposition (SVD) to the user-item matrix to uncover latent factors that explain user preferences. SVD factorizes the matrix into three components:
- U: User-feature matrix
- Σ (sigma): Diagonal matrix of singular values
- Vt: Transposed item-feature matrix
We then use these to reconstruct an approximation of the full user-item matrix with predicted ratings.
# Fill NaNs with 0 for SVD
matrix_filled = user_item_matrix.fillna(0)
# Convert to numpy array
R = matrix_filled.values
# Number of latent factors
k = 15
# Apply SVD
U, sigma, Vt = svds(R, k=k)
# Convert sigma (1D) into a diagonal matrix
sigma = np.diag(sigma)
# Reconstruct the ratings matrix
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
# Convert back to DataFrame for easier handling
predicted_df = pd.DataFrame(predicted_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)
# Preview
predicted_df.head()
| product_id | B0002L5R78 | B000JMJWV2 | B000LRMS66 | B000N99BBC | B000QUUFRW | B000VX6XL6 | B0019EHU8G | B001E1Y5O6 | B001TH7GUU | B002R5AM7C | ... | B00829TIEK | B0082E9K7U | B00834SJNA | B00834SJSK | B0088CJT4U | B008DWCRQW | B009SYZ8OC | B00BOHNYTW | B00G4UQ6U8 | B00HFRWWAM |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id | |||||||||||||||||||||
| A100UD67AHFODS | 0.197361 | 0.216765 | 0.427702 | 0.063351 | 0.363190 | 0.495266 | 0.018857 | 0.328507 | -0.245672 | -0.011941 | ... | 0.280657 | 1.814255 | -0.223170 | -0.466789 | -0.012076 | 0.181163 | 0.084884 | -0.225589 | 1.647371 | 0.144609 |
| A100WO06OQR8BQ | 0.772218 | 0.342182 | 0.195361 | 0.472087 | -0.826224 | 0.967543 | 0.399888 | 0.456739 | 0.499931 | 3.059293 | ... | 0.181453 | 0.850221 | 0.457965 | -0.024183 | 5.572918 | 3.233693 | 0.810372 | -0.100217 | 0.460520 | 0.641099 |
| A10AFVU66A79Y1 | -0.055168 | 0.251909 | 0.380853 | -0.248796 | 1.165782 | -0.017622 | 0.043914 | 0.567286 | 0.298237 | 0.443113 | ... | -0.160385 | 0.185604 | 0.002711 | 0.088393 | -0.090563 | -0.042144 | 0.219962 | 0.269924 | 0.026083 | 0.104036 |
| A10NMELR4KX0J6 | -0.283842 | -0.417997 | -0.253220 | 0.402361 | 0.339536 | -0.215334 | -0.307922 | 0.147431 | -0.003668 | -0.874813 | ... | -0.092343 | 0.358711 | -0.083676 | 0.427923 | 0.548679 | -0.029192 | -0.052559 | 0.413422 | 0.320292 | 0.109375 |
| A10O7THJ2O20AG | -0.076884 | 0.431264 | 0.733881 | 0.464186 | 2.779335 | 0.286990 | 1.177197 | 0.001412 | 0.573502 | -1.479514 | ... | 0.600708 | 0.395454 | -0.016110 | -0.624510 | 0.187761 | -0.670853 | 0.457595 | 0.580788 | 0.756217 | -0.174561 |
5 rows × 38 columns
Observation¶
- The SVD algorithm decomposed the user-item matrix into three components and reconstructed an approximation of the full matrix with predicted ratings.
- The resulting matrix has the same shape as the original (828 users × 38 products).
- Each cell now contains a predicted rating for how much a user would likely rate an unrated product, based on patterns learned from the data.
- These predicted ratings can be used to generate personalized top-N product recommendations for each user.
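A common refinement, which the notebook does not apply and is offered here only as an assumption-labeled sketch, is to subtract each user's mean rating before running SVD and add it back after reconstruction, so predictions land closer to the original rating scale. On a toy dense matrix:

```python
import numpy as np
from scipy.sparse.linalg import svds

# Toy dense ratings matrix (random floats stand in for real ratings).
rng = np.random.default_rng(0)
R = rng.uniform(1.0, 5.0, size=(6, 5))

# Demean per user, factorize the residuals, then restore the means.
user_means = R.mean(axis=1, keepdims=True)
U, sigma, Vt = svds(R - user_means, k=2)
predicted = U @ np.diag(sigma) @ Vt + user_means
print(predicted.shape)
```

With sparse data this demeaning is usually done over observed entries only; the zero-fill used above treats missing ratings as true zeros, which mean-centering partially mitigates.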
Generate Top-N Recommendations¶
Using the predicted ratings from the SVD output, we select the top N products with the highest predicted scores for each user — excluding the ones they've already rated.
This gives us a personalized recommendation list for every user based on their inferred preferences.
def get_top_n_recommendations(predictions_df, original_df, user_id, n=5):
"""
Returns top-N product recommendations for a given user based on predicted ratings.
Parameters:
predictions_df (DataFrame): SVD-predicted ratings matrix
original_df (DataFrame): Original filtered user-item ratings
user_id (str): The user for whom to generate recommendations
n (int): Number of top recommendations to return
Returns:
DataFrame: Top-N recommended products with predicted ratings
"""
# Get products already rated by the user
rated_products = original_df[original_df['user_id'] == user_id]['product_id'].tolist()
# Get predicted ratings for the user, sort by highest predicted score
user_predictions = predictions_df.loc[user_id].drop(rated_products)
top_n = user_predictions.sort_values(ascending=False).head(n)
return top_n.reset_index().rename(columns={user_id: 'predicted_rating'})
# Example: Top 5 recommendations for one user
example_user = predicted_df.index[0]
get_top_n_recommendations(predicted_df, df_filtered, example_user, n=5)
| | product_id | predicted_rating |
|---|---|---|
| 0 | B00G4UQ6U8 | 1.647371 |
| 1 | B0079UAT0A | 1.169006 |
| 2 | B002V88HFE | 0.708071 |
| 3 | B0041Q38NU | 0.617315 |
| 4 | B000VX6XL6 | 0.495266 |
Observation¶
- We successfully generated Top-5 product recommendations for a sample user based on predicted ratings.
- The recommended products are ranked by predicted preference score — even though the user hasn't rated them before.
- Example recommendations:
  - `B00G4UQ6U8` with predicted rating ≈ 1.65
  - `B0079UAT0A` with predicted rating ≈ 1.17
- These scores are relative and don't need to match the original 1–5 rating scale exactly, as they come from the low-rank matrix approximation.
This final step enables us to deliver personalized product suggestions using learned latent features — a core capability in modern recommendation systems.
📘 Summary and Next Steps¶
✅ What We Accomplished¶
In this project, we built a simple yet effective product recommendation system using Amazon Electronics ratings data:
Data Preprocessing:
- Loaded a large dataset of ~7.8 million Amazon ratings.
- Filtered for strong positive feedback (ratings ≥ 4.0) to focus on user preferences.
- Further filtered for active users and popular products to reduce noise and sparsity.
Exploratory Analysis:
- Analyzed rating distribution, user behavior, and item popularity.
- Observed the long-tail nature of both users and products.
Collaborative Filtering via SVD:
- Created a user-item matrix from the filtered data.
- Applied Singular Value Decomposition (SVD) to learn latent features.
- Reconstructed a predicted ratings matrix.
- Generated Top-N recommendations for users based on these predictions.
🧠 Skills Demonstrated¶
- Data wrangling with Pandas
- Data visualization with Seaborn/Matplotlib
- Dimensionality reduction using SciPy SVD
- Recommendation system logic and matrix factorization
- Building reusable recommendation functions
🚀 Potential Next Steps¶
- Add content-based filtering using product metadata or titles.
- Combine multiple models into a hybrid recommender system.
- Deploy the recommendation engine as a web app using Flask or Streamlit.
- Add evaluation metrics (e.g., RMSE, Precision@K) with a train/test split.
- Scale up to the full dataset with recommender libraries such as `Surprise` (explicit ratings) or `LightFM` and `implicit` (implicit feedback).
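As a starting point for the evaluation step suggested above, Precision@K can be computed by holding out some known-liked items per user and checking how many appear in the top-K recommendations. A hedged sketch (the item IDs below are hypothetical):

```python
def precision_at_k(recommended, relevant, k=5):
    """Fraction of the top-k recommended items that appear in the relevant set."""
    top_k = recommended[:k]
    hits = sum(1 for item in top_k if item in set(relevant))
    return hits / k

# Hypothetical top-5 list for one user and their held-out liked items.
recommended = ['B001', 'B002', 'B003', 'B004', 'B005']
held_out = {'B002', 'B005', 'B009'}
print(precision_at_k(recommended, held_out, k=5))  # 2 hits out of 5 -> 0.4
```

Averaging this score over all test users, after a time-based or random train/test split, gives a single ranking-quality number to compare model variants.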